Данные об университетах.
Признаки:
# Load the university data set; read.csv2() expects ';' field separators
# and ',' as the decimal mark (common for European/Russian CSV exports).
uni_data <- read.csv2("I.csv")
Покажем, что данные неоднородны. Посмотрим, например, на скаттерплот IN_STATE vs INSTRUCT.
# Scatterplot of IN_STATE vs INSTRUCT with fill colour and symbol driven by
# the ownership indicator PPIND, to show the data split into two groups.
# NOTE(review): c("red","blue")[uni_data$PPIND] assumes PPIND takes values
# 1/2 (or is a two-level factor). If PPIND is coded 0/1, index 0 selects
# nothing and those points get no fill -- confirm the coding in I.csv.
plot(uni_data$IN_STATE, uni_data$INSTRUCT, main = "Grouped by PPIND", xlab = "IN_STATE", ylab = "INSTRUCT", bg=c("red","blue")[uni_data$PPIND],
pch=21+as.numeric(uni_data$PPIND))
Далее будем рассматривать только государственные университеты (PPIND == 1)
Признаков очень много. Возможно, стоит разбить количественные признаки на группы и выбрать какие-нибудь представители от них.
Разобьем, например, так:
## $SAT_ACT_results
## [1] "AVRMATH" "AVRVERB" "AVRCOMB" "AVR_ACT" "MATH_1" "MATH_3" "VERB_1" "VERB_3" "ACT_1" "ACT_3"
##
## $Costs
## [1] "R_B_COST" "ROOM" "BOARD" "ADD_FEE" "BOOK" "PERSONAL"
##
## $Benefits
## [1] "IN_STATE" "OUT_STAT" "INSTRUCT"
##
## $Application
## [1] "APP_REC" "APP_ACC" "NEW_STUD" "NEW10" "NEW25"
##
## $Quality
## [1] "SF_RATIO" "PH_D" "TERM_D" "DONATE"
##
## $Students
## [1] "FULLTIME" "PARTTIME" "GRADUAT"
##
## $Salaries
## [1] "SAL_FULL" "SAL_AC" "SAL_AS" "SAL_ALL"
##
## $NumbersOfPeople
## [1] "NUM_FULL" "NUM_AC" "NUM_AS" "NUM_ALL"
##
## $Comp
## [1] "COMP_FUL" "COMP_AC" "COMP_AS" "COMP_ALL"
Построим matrix plot’ы.
# Keep only the public universities (PPIND == 1) and draw one scatterplot
# matrix (psych::pairs.panels) per variable group listed in `types`,
# which is defined earlier in the document.
uni_data_public <- uni_data[uni_data$PPIND == 1, ]
groupsNames <- names(types)
mtp <- lapply(groupsNames, function(grp) {
  cols <- types[[grp]]
  pairs.panels(uni_data_public[, cols],
               lm = TRUE,
               ellipses = FALSE,
               main = grp)
})
Прологорифмируем все кроме SF_RATIO, GRADUAT, PH_D, TERM_D и попробуем еще раз. Кроме того, выпишем количество пропусков.
# Log-transform every quantitative column except the leading id/categorical
# columns (1-5) and SF_RATIO, GRADUAT, PH_D, TERM_D (columns 31-33, 36),
# then redraw the matrix plots and print the NA count for each column.
# NOTE(review): the column positions are assumed to match the original CSV
# layout -- confirm against names(uni_data).
keep_as_is <- c(1:5, 31:33, 36)  # single source of truth for the skip list
uni_data_log <- uni_data_public
uni_data_log[, -keep_as_is] <- apply(uni_data_log[, -keep_as_is], 2, log)
mtp <- lapply(types, function(cols) {
  pairs.panels(uni_data_log[, cols],
               lm = TRUE,
               ellipses = FALSE)
  # Number of missing values per selected column (same printed output as
  # the previous length(col[is.na(col)]) form, but idiomatic).
  lst <- lapply(uni_data_log[, cols], function(col) sum(is.na(col)))
  print(lst)
})
## $AVRMATH
## [1] 43
##
## $AVRVERB
## [1] 43
##
## $AVRCOMB
## [1] 43
##
## $AVR_ACT
## [1] 49
##
## $MATH_1
## [1] 32
##
## $MATH_3
## [1] 32
##
## $VERB_1
## [1] 32
##
## $VERB_3
## [1] 32
##
## $ACT_1
## [1] 45
##
## $ACT_3
## [1] 45
## $R_B_COST
## [1] 0
##
## $ROOM
## [1] 34
##
## $BOARD
## [1] 54
##
## $ADD_FEE
## [1] 33
##
## $BOOK
## [1] 2
##
## $PERSONAL
## [1] 10
## $IN_STATE
## [1] 7
##
## $OUT_STAT
## [1] 1
##
## $INSTRUCT
## [1] 0
## $APP_REC
## [1] 1
##
## $APP_ACC
## [1] 1
##
## $NEW_STUD
## [1] 0
##
## $NEW10
## [1] 15
##
## $NEW25
## [1] 22
## $SF_RATIO
## [1] 0
##
## $PH_D
## [1] 5
##
## $TERM_D
## [1] 11
##
## $DONATE
## [1] 12
## $FULLTIME
## [1] 0
##
## $PARTTIME
## [1] 0
##
## $GRADUAT
## [1] 4
## $SAL_FULL
## [1] 0
##
## $SAL_AC
## [1] 0
##
## $SAL_AS
## [1] 0
##
## $SAL_ALL
## [1] 0
## $NUM_FULL
## [1] 0
##
## $NUM_AC
## [1] 0
##
## $NUM_AS
## [1] 0
##
## $NUM_ALL
## [1] 0
## $COMP_FUL
## [1] 0
##
## $COMP_AC
## [1] 0
##
## $COMP_AS
## [1] 0
##
## $COMP_ALL
## [1] 0
В качестве основных признаков выберем те, что больше всего коррелируют с остальными в группе и содержат меньше всего пропусков. Выберем основные признаки и построим matrix plot’ы для них.
# Choose the primary features (strongest within-group correlations, fewest
# NAs), drop incomplete rows, standardize, and draw their matrix plot.
types$Primary <- c("NEW10", "R_B_COST", "ADD_FEE", "BOOK",
                   "SAL_ALL", "NUM_ALL", "APP_ACC", "PH_D")
primary_complete <- na.omit(uni_data_log[, types$Primary])
uni_data_log <- as.data.frame(scale(primary_complete[, types$Primary]))
mtp <- pairs.panels(uni_data_log[, types$Primary],
                    lm = TRUE,
                    ellipses = FALSE)
Shapiro–Wilk Test до логарифмирования
# Shapiro-Wilk p-values for the primary features on the raw (pre-log) data.
sw_raw <- lapply(uni_data_public[, types$Primary],
                 function(v) shapiro.test(v)$p.value)
print(sw_raw)
## $NEW10
## [1] 9.678601e-11
##
## $R_B_COST
## [1] 0.0008010331
##
## $ADD_FEE
## [1] 1.457477e-14
##
## $BOOK
## [1] 9.108105e-05
##
## $SAL_ALL
## [1] 0.3888914
##
## $NUM_ALL
## [1] 3.875619e-05
##
## $APP_ACC
## [1] 5.484138e-08
##
## $PH_D
## [1] 0.1242419
Shapiro–Wilk Test после логарифмирования
# Shapiro-Wilk p-values for the same features after the log transform.
sw_log <- lapply(uni_data_log[, types$Primary],
                 function(v) shapiro.test(v)$p.value)
print(sw_log)
## $NEW10
## [1] 0.03108544
##
## $R_B_COST
## [1] 0.7211772
##
## $ADD_FEE
## [1] 0.01164912
##
## $BOOK
## [1] 0.1592939
##
## $SAL_ALL
## [1] 0.5915829
##
## $NUM_ALL
## [1] 0.01269193
##
## $APP_ACC
## [1] 0.01923453
##
## $PH_D
## [1] 0.1157398
Признаки стали ближе к нормальным.
Посмотрим на регрессию
# Fit the full linear model: NEW10 against all seven primary predictors.
l <- lm(NEW10 ~ R_B_COST + ADD_FEE + BOOK + SAL_ALL + NUM_ALL + APP_ACC + PH_D,
        data = uni_data_log)
summary(l)
##
## Call:
## lm(formula = NEW10 ~ R_B_COST + ADD_FEE + BOOK + SAL_ALL + NUM_ALL +
## APP_ACC + PH_D, data = uni_data_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.15731 -0.50506 0.03888 0.48687 1.70733
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.194e-16 9.578e-02 0.000 1.0000
## R_B_COST 7.819e-02 1.236e-01 0.632 0.5292
## ADD_FEE 1.825e-01 1.104e-01 1.653 0.1029
## BOOK 1.795e-01 1.020e-01 1.761 0.0827 .
## SAL_ALL 2.269e-01 1.397e-01 1.624 0.1090
## NUM_ALL 8.922e-02 1.717e-01 0.519 0.6051
## APP_ACC -8.486e-02 1.731e-01 -0.490 0.6255
## PH_D 2.822e-01 1.200e-01 2.353 0.0215 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## s: 0.8405 on 69 degrees of freedom
## Multiple R-squared: 0.3587,
## Adjusted R-squared: 0.2936
## F-statistic: 5.512 on 7 and 69 DF, p-value: 4.602e-05
Регрессия значима.
# Redundancy analysis of the predictors.
# Hmisc::redun regresses each predictor on all the others; rsq1 holds those
# R^2 values, and a tiny r2 threshold ensures no variable is dropped.
# ppcor::pcor / ppcor::spcor give partial and semi-partial correlations;
# row 1 of $estimate corresponds to the first column (NEW10), and [1,-1]
# takes its correlations with every predictor.
red <- redun(~ R_B_COST + ADD_FEE + BOOK
+ SAL_ALL + NUM_ALL + APP_ACC + PH_D,
uni_data_log, type = "ordinary", r2 = 0.0000001)$rsq1
pcors <- pcor(na.omit(uni_data_log[types$Primary]))$estimate[1,-1]
semi.pcors <- spcor(na.omit(uni_data_log[types$Primary]))$estimate[1,-1]
# Assemble the table: Tolerance = 1 - R^2; rows are labelled with the
# predictor names (Primary minus the response NEW10).
# NOTE(review): assumes redun's rsq1 order matches types$Primary[-1] --
# confirm against the printed table.
redundancy <- data.frame(cbind(red, 1 - red, pcors,semi.pcors),
row.names = types$Primary[-1])
colnames(redundancy) <- c("Multiple R^2", "Tolerance", "Partial cor", "Semi-partial cor")
print(redundancy)
## Multiple R^2 Tolerance Partial cor Semi-partial cor
## R_B_COST 0.4416787 0.5583213 0.07591831 0.06097414
## ADD_FEE 0.3131245 0.6868755 0.19517176 0.15936545
## BOOK 0.1704369 0.8295631 0.20734853 0.16974127
## SAL_ALL 0.6340405 0.3659595 0.19186325 0.15655974
## NUM_ALL 0.7408881 0.2591119 0.06241516 0.05008199
## APP_ACC 0.7496509 0.2503491 -0.05891074 -0.04725996
## PH_D 0.4539229 0.5460771 0.27250154 0.22681288
Удалим APP_ACC как признак с самым большим \(R^2\) и самым маленьким pcor. Посмотрим, что изменилось.
# Drop APP_ACC (largest multiple R^2, smallest partial correlation) and
# refit. Remove it by name rather than by position [-8], so the code stays
# correct if the Primary vector is ever reordered; the result is identical.
types$Primary <- setdiff(types$Primary, "APP_ACC")
# The original formula contained a stray "+ +" typo; fixed here.
l <- lm(NEW10 ~ R_B_COST + ADD_FEE + BOOK + SAL_ALL + NUM_ALL + PH_D,
        data = uni_data_log)
summary(l)
##
## Call:
## lm(formula = NEW10 ~ R_B_COST + +ADD_FEE + BOOK + SAL_ALL + NUM_ALL +
## PH_D, data = uni_data_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.12378 -0.51621 0.05527 0.49049 1.69568
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.483e-16 9.526e-02 0.000 1.0000
## R_B_COST 5.789e-02 1.159e-01 0.500 0.6189
## ADD_FEE 1.639e-01 1.031e-01 1.589 0.1165
## BOOK 1.764e-01 1.012e-01 1.743 0.0857 .
## SAL_ALL 2.203e-01 1.383e-01 1.593 0.1157
## NUM_ALL 2.641e-02 1.137e-01 0.232 0.8171
## PH_D 2.901e-01 1.182e-01 2.454 0.0166 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## s: 0.8359 on 70 degrees of freedom
## Multiple R-squared: 0.3564,
## Adjusted R-squared: 0.3013
## F-statistic: 6.461 on 6 and 70 DF, p-value: 1.87e-05
Почти не изменился \(R^2\) (был 0.3587, стал 0.3564). Значимость регрессии стала лучше (p-value: 4.602e-05 vs 1.87e-05).
Пошаговая регрессия
# Forward stepwise selection; alpha-to-enter close to 1 forces every
# predictor to eventually enter, so the full RSS/AIC/Cp path is printed.
# NOTE(review): forward() is not base R -- presumably a course-provided
# helper loaded elsewhere; confirm its source.
forward(l,0.99999)
## Forward selection, alpha-to-enter: 0.99999
##
## Full model: NEW10 ~ R_B_COST + +ADD_FEE + BOOK + SAL_ALL + NUM_ALL + PH_D
## <environment: 0x7fd596716d80>
##
## Step RSS AIC R2pred Cp F value Pr(>F)
## PH_D 1 56.773 -19.466 0.21262 8.2505 25.4004 3.132e-06 ***
## SAL_ALL 2 52.753 -23.120 0.25147 4.4979 5.6385 0.02016 *
## BOOK 3 51.114 -23.551 0.26049 4.1515 2.3416 0.13028
## ADD_FEE 4 49.105 -24.638 0.25973 3.2769 2.9451 0.09044 .
## R_B_COST 5 48.949 -22.883 0.23350 5.0539 0.2260 0.63599
## NUM_ALL 6 48.912 -20.942 0.21875 7.0000 0.0539 0.81709
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Call:
## lm(formula = NEW10 ~ PH_D + SAL_ALL + BOOK + ADD_FEE + R_B_COST +
## NUM_ALL, data = uni_data_log)
##
## Coefficients:
## (Intercept) PH_D SAL_ALL BOOK ADD_FEE R_B_COST NUM_ALL
## -8.483e-16 2.901e-01 2.203e-01 1.764e-01 1.639e-01 5.789e-02 2.641e-02
В случае forward regression минимум AIC достигается при добавлении четырех признаков. Заметим, что после добавления SAL_ALL разница в значениях AIC маленькая и может быть объяснена случайностью.
# Backward elimination; alpha-to-remove close to 0 forces every predictor
# to eventually leave, so the full elimination path is printed.
# NOTE(review): backward() is not base R -- presumably the same course
# helper as forward(); confirm its source.
backward(l,1e-9)
## Backward elimination, alpha-to-remove: 1e-09
##
## Full model: NEW10 ~ R_B_COST + +ADD_FEE + BOOK + SAL_ALL + NUM_ALL + PH_D
## <environment: 0x7fd599050340>
##
## Step RSS AIC R2pred Cp F value Pr(>F)
## NUM_ALL 1 48.949 -22.88273 0.233497 5.0539 0.0539 0.81709
## R_B_COST 2 49.105 -24.63805 0.259734 3.2769 0.2260 0.63599
## ADD_FEE 3 51.114 -23.55113 0.260491 4.1515 2.9451 0.09044 .
## BOOK 4 52.753 -23.12003 0.251473 4.4979 2.3416 0.13028
## SAL_ALL 5 56.773 -19.46577 0.212623 8.2505 5.6385 0.02016 *
## PH_D 6 76.000 0.99345 -0.026489 33.7678 25.4004 3.132e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Call:
## lm(formula = NEW10 ~ 1, data = uni_data_log)
##
## Coefficients:
## (Intercept)
## 3.416e-16
В backward regression минимум AIC достигается на четвертом шаге, но здесь снова очень маленькая разница между значениями критерия.
Оставим только признаки PH_D и SAL_ALL.
# Keep only the response and the two predictors retained after stepwise
# selection.
types$Primary <- c("NEW10", "PH_D", "SAL_ALL")
Аутлаеры по Махаланобису
# Mahalanobis distances of the observations in the predictor space
# (PH_D, SAL_ALL). The predictor subset was repeated three times in the
# original call; hoist it into one variable. Also fixes the title typo
# ("Machalanobis" -> "Mahalanobis").
predictors <- uni_data_log[types$Primary[-1]]
m <- sort(mahalanobis(predictors, colMeans(predictors), cov(predictors)))
plot(m, main = "Mahalanobis distance", xlab = "Individual",
     ylab = "distance", ylim = c(0, max(m) + 1))
text(m, labels = names(m), cex = 0.7, pos = 3)
# Observation "112" stands apart on the distance plot: drop it from the
# working data and display its original (un-logged) values.
out <- c("112")
uni_data_log <- uni_data_log[!rownames(uni_data_log) %in% out,]
uni_data[out,c("X", types$Primary[-1])]
## X PH_D SAL_ALL
## 112 University of North 97 379
Аутлаер – государственный университет, где самый высокий процент PhD.
Аутлаеры по Куку.
# Cook's distances for the current fit, sorted, with a rule-of-thumb
# cut-off line drawn in red and point labels for identification.
cook_d <- sort(cooks.distance(l))
plot(cook_d, main = "Cook's distance", xlab = "Individual",
     ylab = "distance", ylim = c(0, 0.2))
abline(h = 4 / (nrow(uni_data_log) - length(types$Primary) + 2), col = "red")
text(cook_d, labels = names(cook_d), cex = 0.7, pos = 3)
# Observations "37" and "74" exceed the cut-off: show them in the original
# data, then remove them from the working set.
out <- c("37", "74")
print(uni_data[out,c("X", types$Primary[-1])])
uni_data_log <- uni_data_log[!rownames(uni_data_log) %in% out,]
Аутлаеры – государственные университеты, где высокий процент PhD и при этом большие зарплаты.
Посмотрим на нормальность остатков.
# Refit the final two-predictor model on the outlier-free data.
l <- lm(NEW10 ~ PH_D + SAL_ALL, data = uni_data_log)
summary(l)
##
## Call:
## lm(formula = NEW10 ~ PH_D + SAL_ALL, data = uni_data_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.81492 -0.45477 -0.02579 0.46737 1.49861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.06574 0.09076 0.724 0.471256
## PH_D 0.39358 0.11396 3.454 0.000936 ***
## SAL_ALL 0.27423 0.11782 2.328 0.022791 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## s: 0.7798 on 71 degrees of freedom
## Multiple R-squared: 0.3717,
## Adjusted R-squared: 0.354
## F-statistic: 21 on 2 and 71 DF, p-value: 6.846e-08
# Residuals-vs-fitted diagnostic plot and a formal Shapiro-Wilk normality
# test of the residuals.
plot(l, which = 1)
shapiro.test(l$residuals)
##
## Shapiro-Wilk normality test
##
## data: l$residuals
## W = 0.98245, p-value = 0.3949
Normal Probability Plot
# Normal probability plot of the residuals.
# Bug fix: the empirical CDF equals exactly 1 at the largest residual, so
# qnorm(1) = Inf and that point silently disappeared from the original
# plot. Shifting the ECDF values by 0.5/n keeps every quantile finite.
ef <- ecdf(l$residuals)
n <- length(l$residuals)
plot(l$residuals, qnorm(ef(l$residuals) - 0.5 / n),
     xlab = "Residuals", ylab = "Sample Quantiles",
     main = "Normal Probability Plot")
abline(a = 0, b = 1)
Residual analysis
# Compare ordinary residuals with deleted (leave-one-out) residuals.
# The full-data fit does not depend on the loop index, so fit it once
# outside the loop (the original refit it on every iteration).
df <- na.omit(uni_data_log[, types$Primary])
full_fit <- lm(NEW10 ~ PH_D + SAL_ALL, df)
resid.list <- sapply(seq_len(nrow(df)), function(i) {
  # Row i as a prediction vector: intercept followed by the predictors.
  # NOTE(review): assumes df columns are ordered NEW10, PH_D, SAL_ALL so
  # that df[i, -1] lines up with the coefficient order -- confirm.
  xi <- c(1, as.numeric(df[i, -1]))
  resid.with.i <- df[i, "NEW10"] - as.numeric(full_fit$coefficients) %*% xi
  # Refit without observation i to get its deleted residual.
  loo_fit <- lm(NEW10 ~ PH_D + SAL_ALL, df[-i, ])
  resid.without.i <- df[i, "NEW10"] - as.numeric(loo_fit$coefficients) %*% xi
  c(resid.with.i, resid.without.i)
})
plot(resid.list[1, ], resid.list[2, ],
     main = "Residuals vs Deleted Residuals",
     xlab = "Residuals",
     ylab = "Deleted Residuals")
abline(a = 0, b = 1, col = "red")
Видно, что Residuals \(\leq\) Deleted Residuals.